Doing tests on the full dataset 1. Make new file with full dataset like for Palgen workflow 2. Make graphs with median of each donor for all time points - CD66, CD45, etc. 3. Model a spline regression for each marker on all timepoints

Load packages.

library("CytoGLMM")
## Warning: replacing previous import 'MASS::select' by 'dplyr::select' when
## loading 'CytoGLMM'
## Warning: replacing previous import 'cowplot::ggsave' by 'ggplot2::ggsave'
## when loading 'CytoGLMM'
## Warning: replacing previous import 'stringr::boundary' by
## 'strucchange::boundary' when loading 'CytoGLMM'
library("tidyverse")
library("magrittr")
library("FlowRepositoryR")
library("flowCore")
library("openCyto")
library("ggcyto")
library("scales")
library("parallel")
library("RColorBrewer")
library("ggcorrplot")
library("SummarizedExperiment")
library("lme4")
library("lmerTest")
library(mgcv)
# setting computational resources
ncores = parallel::detectCores()

0.1 Data formatting - sample table

After downloading the data, a sample table is created by parsing the fcs filenames.

fcs_filesVA2 = list.files(path = "FlowRepository_FR-FCM-ZYPY_files", pattern = "fcs")

#Make new column with three levels related to term
map_term = function(x) {
  if (str_detect(x, "BPD019H00|PPD000H00")) "Before-prime" 
  else if (str_detect(x, "PPD000H03|PPD000H06|PPD001H00|PPD003H00|PPD014H00|PBD000H00")) "Post-prime"
  else if (str_detect(x, "PBD000H03|PBD000H06|PBD001H00|PBD003H00")) "Post-boost"
  else NA
}

sample_tableVA2 = tibble(
  donor = str_extract(fcs_filesVA2, "_B[B-D]{1}..."),
  term = str_extract(fcs_filesVA2, "..D0..H0.") %>% as.factor,
  grouped_term = sapply(fcs_filesVA2, map_term) %>% as.factor,
  file_name = paste0("FlowRepository_FR-FCM-ZYPY_files/",fcs_filesVA2)
)
sample_tableVA2$donor = gsub("_", "", sample_tableVA2$donor)

sample_tableVA2
## # A tibble: 43 x 4
##    donor term      grouped_term file_name                                  
##    <chr> <fct>     <fct>        <chr>                                      
##  1 BB078 BPD019H00 Before-prime FlowRepository_FR-FCM-ZYPY_files/BPD019H00…
##  2 BB231 BPD019H00 Before-prime FlowRepository_FR-FCM-ZYPY_files/BPD019H00…
##  3 BC641 BPD019H00 Before-prime FlowRepository_FR-FCM-ZYPY_files/BPD019H00…
##  4 BD620 BPD019H00 Before-prime FlowRepository_FR-FCM-ZYPY_files/BPD019H00…
##  5 BB078 PBD000H00 Post-prime   FlowRepository_FR-FCM-ZYPY_files/PBD000H00…
##  6 BB231 PBD000H00 Post-prime   FlowRepository_FR-FCM-ZYPY_files/PBD000H00…
##  7 BD620 PBD000H00 Post-prime   FlowRepository_FR-FCM-ZYPY_files/PBD000H00…
##  8 BB078 PBD000H03 Post-boost   FlowRepository_FR-FCM-ZYPY_files/PBD000H03…
##  9 BB231 PBD000H03 Post-boost   FlowRepository_FR-FCM-ZYPY_files/PBD000H03…
## 10 BC641 PBD000H03 Post-boost   FlowRepository_FR-FCM-ZYPY_files/PBD000H03…
## # … with 33 more rows

0.2 Marker table

A marker table is created by extracting the marker isotopes and protein names. A third column is added to determine whether the markers were used for identifying cells (phenotype), are functional proteins, or were unused. Palgen et al. (2019) state in their paper that due to low reactivity multiple markers needed to be excluded from the analysis.

# names of markers used for gating extracted from the paper
map_type = function(x) {
  if (str_detect(x, paste(c("CD66", "HLADR", "CD3", "CD107a", "CD8", "CD45", "GranzymeB", "CD56", "CD62L", "CD4", "CD11a", "CD2", "CD7", "NKG2D", "CD11c", "CD69", "CD25", "CD16", "CCR5", "CXCR4", "CD14", "perforine", "NKG2A", "CD20", "CCR7"),collapse = '|'))) "phenotype"
  else if (str_detect(x, paste(c("Di", "Time", "Cell_length", "cells", "File Number"),collapse = '|'))) "unused"
  else "function"
}

#Creating marker table
y <- sample_tableVA2$file_name[1]
fcs = read.FCS(y, transformation = FALSE)

isotope = pData(parameters(fcs))[,"name"]
protein_name = pData(parameters(fcs))[,"desc"]
type = sapply(protein_name, map_type)
tb_marker = tibble(isotope, protein_name, type)
tb_marker
## # A tibble: 45 x 3
##    isotope     protein_name type     
##    <I<chr>>    <I<chr>>     <chr>    
##  1 Time        Time         unused   
##  2 Cell_length Cell_length  unused   
##  3 (Rh103)Di   (Rh103)Di    unused   
##  4 (Xe131)Di   (Xe131)Di    unused   
##  5 (Cs133)Di   (Cs133)Di    unused   
##  6 (La139)Di   (La139)Di    unused   
##  7 (Ce140)Di   (Ce140)Di    unused   
##  8 (Pr141)Di   CD66         phenotype
##  9 (Nd142)Di   HLADR        phenotype
## 10 (Nd143)Di   CD3          phenotype
## # … with 35 more rows
#deleting before last two rows, 43 & 44 as their protein_name was 'cells' 
tb_marker <- tb_marker[-c(43,44), ]
tb_marker
## # A tibble: 43 x 3
##    isotope     protein_name type     
##    <I<chr>>    <I<chr>>     <chr>    
##  1 Time        Time         unused   
##  2 Cell_length Cell_length  unused   
##  3 (Rh103)Di   (Rh103)Di    unused   
##  4 (Xe131)Di   (Xe131)Di    unused   
##  5 (Cs133)Di   (Cs133)Di    unused   
##  6 (La139)Di   (La139)Di    unused   
##  7 (Ce140)Di   (Ce140)Di    unused   
##  8 (Pr141)Di   CD66         phenotype
##  9 (Nd142)Di   HLADR        phenotype
## 10 (Nd143)Di   CD3          phenotype
## # … with 33 more rows

As claimed by the authors (Palgen et al., 2019) the data has been normalized and gated. The cells contained are assumed to be natural killer (NK) cells as the paper only focuses on NK cells.

0.2.1 Combine the sample and marker dataset

# load data
fset2 = read.ncdfFlowSet(sample_tableVA2$file_name, mc.cores = 2) 
pData(fset2) = cbind(pData(fset2),sample_tableVA2)
df_samples2 = lapply(seq(fset2), function(sample_id) {
    marker_ids = which(fset2@colnames %in% tb_marker$isotope)
    exprs = as_tibble(exprs(fset2[[sample_id]]))[,marker_ids]
    file_name = pData(fset2[sample_id])$file_name
    exprs %>% add_column(file_name)
  }) %>% bind_rows
str(df_samples2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    7262238 obs. of  44 variables:
##  $ Time       : num  14 50 74 121 122 180 191 196 211 242 ...
##  $ Cell_length: num  20 34 18 21 22 19 28 18 29 20 ...
##  $ (Rh103)Di  : num  -0.107 -0.814 -0.608 -0.799 -0.655 ...
##  $ (Xe131)Di  : num  -0.533 -0.3947 -0.6889 -0.4583 -0.0429 ...
##  $ (Cs133)Di  : num  -0.75592 -0.73083 -0.00585 -0.38208 -0.18158 ...
##  $ (La139)Di  : num  1.866 0.689 -0.129 -0.136 -0.23 ...
##  $ (Ce140)Di  : num  -0.0534 -0.0871 -0.1385 0.7735 -0.4707 ...
##  $ (Pr141)Di  : num  27.112 44.846 29.033 2.055 0.689 ...
##  $ (Nd142)Di  : num  6.687 66.422 -0.378 7.603 -0.722 ...
##  $ (Nd143)Di  : num  2.87 4.01 2.16 1.88 14 ...
##  $ (Nd144)Di  : num  0.356 80.026 -0.357 69.208 7.077 ...
##  $ (Nd145)Di  : num  -0.51 4.095 -0.286 4.316 0.012 ...
##  $ (Nd146)Di  : num  14.77 5.04 7.51 2.72 10.73 ...
##  $ (Sm147)Di  : num  -0.317 1.608 1.358 -0.53 3.5 ...
##  $ (Nd148)Di  : num  0.257 5.051 1.246 0.83 0.429 ...
##  $ (Sm149)Di  : num  -0.117 35.481 -0.848 4.763 2.298 ...
##  $ (Nd150)Di  : num  2.2454 4.1198 0.7394 1.9436 -0.0443 ...
##  $ (Eu151)Di  : num  -0.95 -0.313 -0.64 -0.655 -0.898 ...
##  $ (Sm152)Di  : num  -0.512 5.703 0.813 2.899 4.556 ...
##  $ (Eu153)Di  : num  12.2 18.39 5.6 134.34 3.82 ...
##  $ (Sm154)Di  : num  10.77 -0.252 -0.55 1.098 42.318 ...
##  $ (Gd155)Di  : num  -0.954 -0.265 0.108 -0.29 11.992 ...
##  $ (Gd156)Di  : num  -0.84 -0.153 0.767 -0.428 -0.232 ...
##  $ (Gd158)Di  : num  -0.3801 11.2318 -0.7972 -0.0977 -0.8671 ...
##  $ (Tb159)Di  : num  -0.909 0.192 2.478 -0.414 -0.341 ...
##  $ (Gd160)Di  : num  -0.179 1.671 -0.468 -0.484 -0.649 ...
##  $ (Dy161)Di  : num  1.231 0.455 2.82 0.308 -0.717 ...
##  $ (Dy162)Di  : num  10.425 0.971 3.853 36.949 -0.747 ...
##  $ (Dy163)Di  : num  -0.3526 2.1858 1.8362 -0.2632 -0.0234 ...
##  $ (Dy164)Di  : num  1.991 4.39 1.224 0.611 -0.828 ...
##  $ (Ho165)Di  : num  0.241 1.261 0.587 -0.884 -0.879 ...
##  $ (Er166)Di  : num  10.22 19.1 3.66 4.08 4.82 ...
##  $ (Er167)Di  : num  2.834 10.3 4.401 35.658 0.948 ...
##  $ (Er168)Di  : num  4.74 23.66 1.67 2.37 9.24 ...
##  $ (Tm169)Di  : num  0.85 14.67 3.41 4.19 1.01 ...
##  $ (Er170)Di  : num  3.6009 24.8262 0.0821 -0.4231 -0.5585 ...
##  $ (Yb171)Di  : num  2.072 7.521 0.244 3.787 1.558 ...
##  $ (Yb172)Di  : num  -0.501 3.983 0.436 9.772 -0.371 ...
##  $ (Yb173)Di  : num  -0.575 -0.293 -0.654 0.223 -0.303 ...
##  $ (Yb174)Di  : num  13.2755 0.4976 5.2598 0.0817 0.5372 ...
##  $ (Lu175)Di  : num  6.017 17.21 6.324 0.391 2.42 ...
##  $ (Lu176)Di  : num  1.68 4.63 1.91 1.85 2.05 ...
##  $ FileNum    : num  0.961 1.088 0.958 0.91 0.991 ...
##  $ file_name  : chr  "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" ...

0.2.2 Re-naming columns

df_samples2 %<>% inner_join(sample_tableVA2,by = "file_name")
oldnames = tb_marker$isotope
newnames = tb_marker$protein_name
df_samples2 %<>% rename_at(vars(oldnames), ~ newnames)
str(df_samples2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    7262238 obs. of  47 variables:
##  $ Time        : num  14 50 74 121 122 180 191 196 211 242 ...
##  $ Cell_length : num  20 34 18 21 22 19 28 18 29 20 ...
##  $ (Rh103)Di   : num  -0.107 -0.814 -0.608 -0.799 -0.655 ...
##  $ (Xe131)Di   : num  -0.533 -0.3947 -0.6889 -0.4583 -0.0429 ...
##  $ (Cs133)Di   : num  -0.75592 -0.73083 -0.00585 -0.38208 -0.18158 ...
##  $ (La139)Di   : num  1.866 0.689 -0.129 -0.136 -0.23 ...
##  $ (Ce140)Di   : num  -0.0534 -0.0871 -0.1385 0.7735 -0.4707 ...
##  $ CD66        : num  27.112 44.846 29.033 2.055 0.689 ...
##  $ HLADR       : num  6.687 66.422 -0.378 7.603 -0.722 ...
##  $ CD3         : num  2.87 4.01 2.16 1.88 14 ...
##  $ CD107a      : num  0.356 80.026 -0.357 69.208 7.077 ...
##  $ CD8         : num  -0.51 4.095 -0.286 4.316 0.012 ...
##  $ CD45        : num  14.77 5.04 7.51 2.72 10.73 ...
##  $ IL4         : num  -0.317 1.608 1.358 -0.53 3.5 ...
##  $ GranzymeB   : num  0.257 5.051 1.246 0.83 0.429 ...
##  $ CD56        : num  -0.117 35.481 -0.848 4.763 2.298 ...
##  $ CD62L       : num  2.2454 4.1198 0.7394 1.9436 -0.0443 ...
##  $ (Eu151)Di   : num  -0.95 -0.313 -0.64 -0.655 -0.898 ...
##  $ CD4         : num  -0.512 5.703 0.813 2.899 4.556 ...
##  $ CD11a       : num  12.2 18.39 5.6 134.34 3.82 ...
##  $ CD2         : num  10.77 -0.252 -0.55 1.098 42.318 ...
##  $ CD7         : num  -0.954 -0.265 0.108 -0.29 11.992 ...
##  $ MIP1B       : num  -0.84 -0.153 0.767 -0.428 -0.232 ...
##  $ (Gd158)Di   : num  -0.3801 11.2318 -0.7972 -0.0977 -0.8671 ...
##  $ TNFa        : num  -0.909 0.192 2.478 -0.414 -0.341 ...
##  $ Ki67        : num  -0.179 1.671 -0.468 -0.484 -0.649 ...
##  $ NKG2D       : num  1.231 0.455 2.82 0.308 -0.717 ...
##  $ CD11c       : num  10.425 0.971 3.853 36.949 -0.747 ...
##  $ (Dy163)Di   : num  -0.3526 2.1858 1.8362 -0.2632 -0.0234 ...
##  $ CD69        : num  1.991 4.39 1.224 0.611 -0.828 ...
##  $ IFNg        : num  0.241 1.261 0.587 -0.884 -0.879 ...
##  $ CD25        : num  10.22 19.1 3.66 4.08 4.82 ...
##  $ CD16        : num  2.834 10.3 4.401 35.658 0.948 ...
##  $ CCR5        : num  4.74 23.66 1.67 2.37 9.24 ...
##  $ CXCR4       : num  0.85 14.67 3.41 4.19 1.01 ...
##  $ CD14        : num  3.6009 24.8262 0.0821 -0.4231 -0.5585 ...
##  $ perforine   : num  2.072 7.521 0.244 3.787 1.558 ...
##  $ NKG2A       : num  -0.501 3.983 0.436 9.772 -0.371 ...
##  $ (Yb173)Di   : num  -0.575 -0.293 -0.654 0.223 -0.303 ...
##  $ CD20        : num  13.2755 0.4976 5.2598 0.0817 0.5372 ...
##  $ CCR7        : num  6.017 17.21 6.324 0.391 2.42 ...
##  $ IL10        : num  1.68 4.63 1.91 1.85 2.05 ...
##  $ File Number : num  0.961 1.088 0.958 0.91 0.991 ...
##  $ file_name   : chr  "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" "FlowRepository_FR-FCM-ZYPY_files/BPD019H00_BB078.fcs" ...
##  $ donor       : chr  "BB078" "BB078" "BB078" "BB078" ...
##  $ term        : Factor w/ 12 levels "BPD019H00","PBD000H00",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ grouped_term: Factor w/ 3 levels "Before-prime",..: 1 1 1 1 1 1 1 1 1 1 ...
#factor
df_samples2$term %<>% factor(levels = c("BPD019H00", "PPD000H00", "PPD000H03", "PPD000H06", "PPD001H00", "PPD003H00", "PPD014H00", "PBD000H00", "PBD000H03", "PBD000H06", "PBD001H00", "PBD003H00"))
df_samples2$grouped_term %<>% factor(levels = c("Before-prime", "Post-prime", "Post-boost"))

Cell counts listed per donor and condition

table(df_samples2$donor,df_samples2$term)
##        
##         BPD019H00 PPD000H00 PPD000H03 PPD000H06 PPD001H00 PPD003H00
##   BB078    115117     74607    178094    163789    170962    123565
##   BB231     84527     93135    153506    228105    220444    123631
##   BC641    101816         0    135239    172985    150975    127794
##   BD620     67316     67358    120086     93624    135219         0
##        
##         PPD014H00 PBD000H00 PBD000H03 PBD000H06 PBD001H00 PBD003H00
##   BB078    188553    170024    243760    206027    299428    106990
##   BB231    155249    222997    310251    330010    282710    191831
##   BC641    192715         0     96078    264225    359989         0
##   BD620    123768    149477         0    171421    151180    143661

0.2.3 Selecting used proteins only & transforming data

#all used proteins
protein_names_all = tb_marker %>% 
  dplyr::filter(type != "unused") %>%
  .$protein_name
protein_names_all
##  [1] "CD66"      "HLADR"     "CD3"       "CD107a"    "CD8"      
##  [6] "CD45"      "IL4"       "GranzymeB" "CD56"      "CD62L"    
## [11] "CD4"       "CD11a"     "CD2"       "CD7"       "MIP1B"    
## [16] "TNFa"      "Ki67"      "NKG2D"     "CD11c"     "CD69"     
## [21] "IFNg"      "CD25"      "CD16"      "CCR5"      "CXCR4"    
## [26] "CD14"      "perforine" "NKG2A"     "CD20"      "CCR7"     
## [31] "IL10"
#declare columns that are not protein markers
sample_info_names = c(names(sample_tableVA2))

#transform
trans_func = function(x) asinh(x/5)
df_samples_tfm2 = df_samples2 %>% mutate_at(protein_names_all, trans_func)

0.2.4 Subsample cells to a maximum number of cells per donor.

Subsample data, as models on full data are computationally intensive and models are very similar.

ncells = 1000
if(nrow(df_samples_tfm2) > ncells) {
  print(paste("subsampled to",ncells,"per donor"))
  set.seed(2019)
  # subsample depending on max cell count
  df_count = df_samples_tfm2 %>% group_by(donor) %>% tally() %>%
    mutate(nnew = ifelse(n > ncells,ncells,n))
  # create table with a data frame in one column
  df_nested = df_samples_tfm2 %>% group_by(donor) %>% nest() %>%
    left_join(df_count,by = "donor")
  # subsample per donor
  df_samples_tfm2 = df_nested %>%
    mutate(samp = map2(data, nnew, sample_n)) %>%
    dplyr::select(donor, samp) %>%
    unnest()
} else {
  print("no subsampling done")
}
## [1] "subsampled to 1000 per donor"

0.3 Plot Median Marker Expression

Plot all celltypes.

#defining numeric term for spline regression
df_samples_tfm2$numeric_term = df_samples_tfm2$term
df_samples_tfm2$numeric_term = as.numeric(df_samples_tfm2$numeric_term)

df_samples_tfm2
## # A tibble: 4,000 x 48
##    donor   Time Cell_length `(Rh103)Di` `(Xe131)Di` `(Cs133)Di` `(La139)Di`
##    <chr>  <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
##  1 BB078  50681          24     -0.815      -0.460      -0.455     -0.891  
##  2 BB078 278371          27     -0.277       4.08       -0.338      2.19   
##  3 BB078 419519          21     -0.211      -0.632      -0.0601     2.54   
##  4 BB078 160818          22     -0.314      -0.414      -0.555      0.00716
##  5 BB078 951768          23     -0.122       0.127      -0.0156     1.56   
##  6 BB078 765817          32     -0.0401     -0.456      -0.268     -0.462  
##  7 BB078 703272          20     -0.427      -0.0764     -0.0495     0.0739 
##  8 BB078 111716          20     -0.744      -0.0739     -0.687     -0.106  
##  9 BB078 630364          21     -0.742       1.39       -0.120      4.57   
## 10 BB078  88940          21     -0.360      -0.518      -0.221      1.04   
## # … with 3,990 more rows, and 41 more variables: `(Ce140)Di` <dbl>,
## #   CD66 <dbl>, HLADR <dbl>, CD3 <dbl>, CD107a <dbl>, CD8 <dbl>,
## #   CD45 <dbl>, IL4 <dbl>, GranzymeB <dbl>, CD56 <dbl>, CD62L <dbl>,
## #   `(Eu151)Di` <dbl>, CD4 <dbl>, CD11a <dbl>, CD2 <dbl>, CD7 <dbl>,
## #   MIP1B <dbl>, `(Gd158)Di` <dbl>, TNFa <dbl>, Ki67 <dbl>, NKG2D <dbl>,
## #   CD11c <dbl>, `(Dy163)Di` <dbl>, CD69 <dbl>, IFNg <dbl>, CD25 <dbl>,
## #   CD16 <dbl>, CCR5 <dbl>, CXCR4 <dbl>, CD14 <dbl>, perforine <dbl>,
## #   NKG2A <dbl>, `(Yb173)Di` <dbl>, CD20 <dbl>, CCR7 <dbl>, IL10 <dbl>,
## #   `File Number` <dbl>, file_name <chr>, term <fct>, grouped_term <fct>,
## #   numeric_term <dbl>
#all proteins
df_median_fct2 = df_samples_tfm2 %>%
      group_by(file_name, donor, term, grouped_term, numeric_term) %>%
      summarise_at(protein_names_all, median)

df_median_fct2
## # A tibble: 43 x 36
## # Groups:   file_name, donor, term, grouped_term [43]
##    file_name donor term  grouped_term numeric_term  CD66  HLADR   CD3
##    <chr>     <chr> <fct> <fct>               <dbl> <dbl>  <dbl> <dbl>
##  1 FlowRepo… BB078 BPD0… Before-prime            1 2.67  0.202  0.604
##  2 FlowRepo… BB231 BPD0… Before-prime            1 2.64  0.147  0.354
##  3 FlowRepo… BC641 BPD0… Before-prime            1 0.915 0.0607 0.534
##  4 FlowRepo… BD620 BPD0… Before-prime            1 2.93  0.531  0.742
##  5 FlowRepo… BB078 PBD0… Post-prime              8 4.07  0.316  0.344
##  6 FlowRepo… BB231 PBD0… Post-prime              8 3.75  0.194  0.206
##  7 FlowRepo… BD620 PBD0… Post-prime              8 3.49  0.653  0.549
##  8 FlowRepo… BB078 PBD0… Post-boost              9 3.90  0.268  0.306
##  9 FlowRepo… BB231 PBD0… Post-boost              9 3.64  0.159  0.125
## 10 FlowRepo… BC641 PBD0… Post-boost              9 3.42  0.536  0.572
## # … with 33 more rows, and 28 more variables: CD107a <dbl>, CD8 <dbl>,
## #   CD45 <dbl>, IL4 <dbl>, GranzymeB <dbl>, CD56 <dbl>, CD62L <dbl>,
## #   CD4 <dbl>, CD11a <dbl>, CD2 <dbl>, CD7 <dbl>, MIP1B <dbl>, TNFa <dbl>,
## #   Ki67 <dbl>, NKG2D <dbl>, CD11c <dbl>, CD69 <dbl>, IFNg <dbl>,
## #   CD25 <dbl>, CD16 <dbl>, CCR5 <dbl>, CXCR4 <dbl>, CD14 <dbl>,
## #   perforine <dbl>, NKG2A <dbl>, CD20 <dbl>, CCR7 <dbl>, IL10 <dbl>

Make combined table to plot everything on same diagram (median markers & spline)

#convert term into timepoints in hours (& into rank)
df_combined2 = df_samples_tfm2 %>% mutate(
  term_hours = case_when(term == "BPD019H00" ~ 0,
                  term == "PPD000H00" ~ 456,
                  term == "PPD000H03" ~ 459,
                  term == "PPD000H06" ~ 462,
                  term == "PPD001H00" ~ 480,
                  term == "PPD003H00" ~ 528,
                  term == "PPD014H00" ~ 792,
                  term == "PBD000H00" ~ 1848,
                  term == "PBD000H03" ~ 1851,
                  term == "PBD000H06" ~ 1854,
                  term == "PBD001H00" ~ 1872,
                  term == "PBD003H00" ~ 1920)
)


#median marker plotting
ggplot(df_combined2, aes(term_hours, CD66)) + 
  geom_point() + geom_smooth(method = gam) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#spline regression model & x axis values
model <- gam(CD66 ~ s(numeric_term), data = df_samples_tfm2)
lablist = c("BPD019H00", "PPD000H00", "PPD000H03", "PPD000H06", "PPD001H00", "PPD003H00", "PPD014H00", "PBD000H00", "PBD000H03", "PBD000H06", "PBD001H00", "PBD003H00")

#base R combined plots - make look prettier
plot(df_median_fct2$numeric_term, df_median_fct2$CD66, col=df_median_fct2$grouped_term, main = "Spline regression and donor median markers of CD66 for all timepoints",
     xlab = "time", ylab = "CD66", xaxt="n")
axis(1, at=1:12, labels=FALSE)
text(seq(1, 12, by=1), par("usr")[3] - 0.2, labels = lablist, srt = 45, adj = c(1.1,1.1), pos = 1, xpd = TRUE, cex=.8)
par(new=TRUE)
plot(model, axes = FALSE, ann=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "axes" is not
## a graphical parameter

plot(df_samples_tfm2$numeric_term, df_samples_tfm2$CD66)
lines(smooth.spline(df_samples_tfm2$numeric_term, df_samples_tfm2$CD66))

plot(df_median_fct2$numeric_term, df_median_fct2$CD66, type = "p", col=df_median_fct2$grouped_term)
lines(smooth.spline(df_samples_tfm2$numeric_term, df_samples_tfm2$CD66))

#lines a bit choppier, each x point is a knot, doesn't change y axis
plot(model)
lines(smooth.spline(df_samples_tfm2$numeric_term, df_samples_tfm2$CD66))

Comparing different types of splines. smooth.spline is a linear more basic spline function. gam is a generalized linear model that corrects for more.

Zoom in on marker CD66

#median marker plotting
ggplot(df_median_fct2, aes(term, CD66, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, CD66, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

#spline regression
model <- gam(CD66 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD66 for all timepoints",
     xlab = "term")

#abline(v = 3, col = "green") - Error "plot.new has not been called yet"

Prime seems to be similar to beginning of prime and then downards slope. Just before boost much higher, then downwards slope again within boost. Much more significant shift, y axis up to 4. ! Dip after prime may also be explained by total NK cell amounts going down authors said after vaccination. Really seems that each injection causes a spike in CD66 and it then slowly goes down,and at boost it’s a much steeper increase then for the prime. Numbers on y axis also much higher than others (up to 4). Interestingly, strong increase happens just before the boost injection. One of the markers where donors are the most alike, less spread.

Looking into CD45

ggplot(df_median_fct2, aes(term, CD45, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) + ggtitle("CD45 median marker expressions per donor")

#spline
model <- gam(CD45 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD45 for all timepoints",
     xlab = "term")

#combined plot
model <- gam(CD45 ~ s(numeric_term), data = df_samples_tfm2)

plot(df_median_fct2$numeric_term, df_median_fct2$CD45, col=df_median_fct2$grouped_term, main = "Spline regression and donor median markers of CD45 for all timepoints",
     xlab = "time", ylab = "CD45", xaxt="n")
axis(1, at=1:12, labels=FALSE)
text(seq(1, 12, by=1), par("usr")[3] - 0.1, labels = lablist, srt = 45, adj = c(1.1,1.1), pos = 1, xpd = TRUE, cex=.8)
par(new=TRUE)
plot(model, axes = FALSE, ann=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "axes" is not
## a graphical parameter

#simpler spline regression
plot(df_median_fct2$numeric_term, df_median_fct2$CD45, col=df_median_fct2$grouped_term)
lines(smooth.spline(df_samples_tfm2$numeric_term, df_samples_tfm2$CD45))

CD45 also seems to start off quite high before prime, then go down after prime vaccination, then before boost it is higher again and gently goes down after boost. Y axis goes up to 2. Prime doesn’t seem to stimulate CD45 much initially. Might be a marker that takes longer to take effect.

Looking further into CD56

ggplot(df_median_fct2, aes(term, CD56, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

ggplot(df_median_fct2, aes(term, CD56, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#spline regression
model <- gam(CD56 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD56 for all timepoints",
     xlab = "term")

#combined plot
model <- gam(CD56 ~ s(numeric_term), data = df_samples_tfm2)

plot(df_median_fct2$numeric_term, df_median_fct2$CD56, col=df_median_fct2$grouped_term, main = "Spline regression and donor median markers of CD56 for all timepoints",
     xlab = "time", ylab = "CD56", xaxt="n")
axis(1, at=1:12, labels=FALSE)
text(seq(1, 12, by=1), par("usr")[3] - 0.2, labels = lablist, srt = 45, adj = c(1.1,1.1), pos = 1, xpd = TRUE, cex=.8)
par(new=TRUE)
plot(model, axes = FALSE, ann=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "axes" is not
## a graphical parameter

#simple spline
plot(df_median_fct2$numeric_term, df_median_fct2$CD56, type = "p", col=df_median_fct2$grouped_term)
lines(smooth.spline(df_samples_tfm2$numeric_term, df_samples_tfm2$CD56))

General downwards slope from prime to boost. Large variation in prime amongst donors. Scale only goes up to 1.2 on y axis. Facetwrap interesting as see downards slope better, especially BB078 and BB231 as simply their baseline differs.

Looking into HLADR

ggplot(df_median_fct2, aes(term, HLADR, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

ggplot(df_median_fct2, aes(term, HLADR, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#spline regression
model <- gam(HLADR ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker HLADR for all timepoints",
     xlab = "term")

Seems to have slightly similar pattern to CD45, higher in boost than in prime in general, not strong downwards slopes. Y axis quite small ongy going to 0.6.

Looking into CD107a

ggplot(df_median_fct2, aes(term, CD107a, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

ggplot(df_median_fct2, aes(term, CD107a, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#spline regression
model <- gam(CD107a ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD107a for all timepoints",
     xlab = "term")

No clear pattern. Seems to be in general, per donor, higher beginning and at boost end, but inconsistent. Y axis is only up to 0.75.

Looking into perforin

model <- gam(perforine ~ s(numeric_term), data = df_samples_tfm2)

plot(model,
     main = "Spline regression of protein marker perforin for all timepoints",
     xlab = "term")

ggplot(df_median_fct2, aes(term, perforine, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

ggplot(df_median_fct2, aes(term, perforine, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) +
  theme(axis.text.x = element_text(angle = 45, vjust=0))

0.4 Go through markers that Palgen found and my models did not

Look into CD16

#median marker plotting
ggplot(df_median_fct2, aes(term, CD16, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, CD16, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

#spline regression
model <- gam(CD16 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD16 for all timepoints",
     xlab = "term")

model <- gam(CD16 ~ s(numeric_term), data = df_samples_tfm2)

plot(df_median_fct2$numeric_term, df_median_fct2$CD16, col=df_median_fct2$grouped_term, main = "Spline regression and donor median markers of CD16 for all timepoints",
     xlab = "time", ylab = "CD16", xaxt="n")
axis(1, at=1:12, labels=FALSE)
text(seq(1, 12, by=1), par("usr")[3] - 0.05, labels = lablist, srt = 45, adj = c(1.1,1.1), pos = 1, xpd = TRUE, cex=.8)
par(new=TRUE)
plot(model, axes = FALSE, ann=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "axes" is not
## a graphical parameter

CD16 seems to clearly go up after the boost condition. The reason I did not find it is that the increase seems to be omst prominent one day after the boost. It is still very low 6 hours after the boost injection.

Zoom in on marker granzyme B

#median marker plotting
ggplot(df_median_fct2, aes(term, GranzymeB, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, GranzymeB, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

#spline regression
model <- gam(GranzymeB ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker GranzymeB for all timepoints",
     xlab = "term")

Granzyme B seems relatively high before vaccination. It then goes down after the prime vaccination, and then seems generally to go bak up in boost vaccination but only too similar levels to before prime. Generally there seems to be a slight increase in 3 out of 4 animals in the boost condition as compared to the prime condition. Granzyme B is highlighted as significant by LLMM and Palgen.

Looking into CD69

#median marker plotting
ggplot(df_median_fct2, aes(term, CD69, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, CD69, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

#spline regression
model <- gam(CD69 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD69 for all timepoints",
     xlab = "term")

#combined plot
model <- gam(CD69 ~ s(numeric_term), data = df_samples_tfm2)

plot(df_median_fct2$numeric_term, df_median_fct2$CD69, col=df_median_fct2$grouped_term, main = "Spline regression and donor median markers of CD69 for all timepoints",
     xlab = "time", ylab = "CD69", xaxt="n")
axis(1, at=1:12, labels=FALSE)
text(seq(1, 12, by=1), par("usr")[3] - 0.1, labels = lablist, srt = 45, adj = c(1.1,1.1), pos = 1, xpd = TRUE, cex=.8)
par(new=TRUE)
plot(model, axes = FALSE, ann=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "axes" is not
## a graphical parameter

Similar to CD16, it seems that CD69 has it’s peak increase one day after the boost and therefore was not seen as significant by the models used in this thesis.

Looking into CCR5

#median marker plotting
ggplot(df_median_fct2, aes(term, CCR5, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, CCR5, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

ggsave(filename = "CCR5 facet per donor.jpg", width = 7, height = 4)

#spline regression
model <- gam(CCR5 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CCR5 for all timepoints",
     xlab = "term")

CCR5 starts off relatively high, has a decrease in the beginning of prime, rises back to original levels just before the prime and then decreases again. It seems that CCR5 is temporarily downregulated by each vaccination, without a lasting effect. Palgen et al.’s conclusion that CCR5 is upregulated in the boost condition does not seem to be the case and the findings of the LLMM model seem more realistic that it is generally downregulated. If the boost condition had been measured longer than just 3 days after the vaccination, the same pattern of quantities going back to baseline may have been observed. They probably found significant due to one outlier.

Looking into CD11c

#median marker plotting
ggplot(df_median_fct2, aes(term, CD11c, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, CD11c, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

#spline regression
model <- gam(CD11c ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CD11c for all timepoints",
     xlab = "term")

model <- gam(CD11c ~ s(numeric_term), data = df_samples_tfm2)

plot(df_median_fct2$numeric_term, df_median_fct2$CD11c, col=df_median_fct2$grouped_term, main = "Spline regression and donor median markers of CD11c for all timepoints",
     xlab = "time", ylab = "CD11c", xaxt="n")
axis(1, at=1:12, labels=FALSE)
text(seq(1, 12, by=1), par("usr")[3] - 0.1, labels = lablist, srt = 45, adj = c(1.1,1.1), pos = 1, xpd = TRUE, cex=.8)
par(new=TRUE)
plot(model, axes = FALSE, ann=FALSE)
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter

## Warning in plot.xy(xy.coords(x, y), type = type, ...): "axes" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "axes" is not
## a graphical parameter

Similarly to CD16 and CD69, it seems that the discrepancy found in Palgen et al.’s results and mine are because the strongest increase in CD11c occured one day after the boost vaccination. It seems to fluctuate around during prime, have a slight decrease just after the boost (3 and 6 hours after), and then rise rather rapidly one day after the boost. The scale here goes up to 0.9, slightly higher than some others, nowhere near as clear as CD66 that went up to 4.

Looking into CCR7

#median marker plotting
ggplot(df_median_fct2, aes(term, CCR7, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0))

#median marker plotting per donor
ggplot(df_median_fct2, aes(term, CCR7, color = grouped_term)) + 
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~donor) + 
  theme(axis.text.x = element_text(angle = 45, vjust=0)) 

#spline regression
model <- gam(CCR7 ~ s(numeric_term), data = df_samples_tfm2)
plot(model,
     main = "Spline regression of protein marker CCR7 for all timepoints",
     xlab = "term")